{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import cPickle\n",
    "from collections import defaultdict\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load train data and print its shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 3)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_train = pd.read_csv('DATA/labeledTrainData.tsv', sep='\\t')\n",
    "data_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load test data and print its shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_test = pd.read_csv('DATA/testData.tsv', sep='\\t')\n",
    "data_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define pre-processing functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_data_train_test(data_train, data_test, train_ratio = 0.8, clean_string=True):\n",
    "    \"\"\"\n",
    "    Loads data and split into train and test sets.\n",
    "    \"\"\"\n",
    "    revs = []\n",
    "    vocab = defaultdict(float)\n",
    "    # Pre-process train data set\n",
    "    for i in xrange(data_train.shape[0]):\n",
    "        line = data_train['review'][i]\n",
    "        y = data_train['sentiment'][i]\n",
    "        rev = []\n",
    "        rev.append(line.strip())\n",
    "        if clean_string:\n",
    "            orig_rev = clean_str(' '.join(rev))\n",
    "        else:\n",
    "            orig_rev = ' '.join(rev).lower()\n",
    "        words = set(orig_rev.split())\n",
    "        for word in words:\n",
    "            vocab[word] += 1\n",
    "        datum  = {'y': y, \n",
    "                  'text': orig_rev,\n",
    "                  'num_words': len(orig_rev.split()),\n",
    "                  'split': int(np.random.rand() < train_ratio)}\n",
    "        revs.append(datum)\n",
    "        \n",
    "    # Pre-process test data set\n",
    "    for i in xrange(data_test.shape[0]):\n",
    "        line = data_test['review'][i]\n",
    "        rev = []\n",
    "        rev.append(line.strip())\n",
    "        if clean_string:\n",
    "            orig_rev = clean_str(' '.join(rev))\n",
    "        else:\n",
    "            orig_rev = ' '.join(rev).lower()\n",
    "        words = set(orig_rev.split())\n",
    "        for word in words:\n",
    "            vocab[word] += 1\n",
    "        datum  = {'y': -1, \n",
    "                  'text': orig_rev,\n",
    "                  'num_words': len(orig_rev.split()),\n",
    "                  'split': -1}\n",
    "        revs.append(datum)\n",
    "        \n",
    "    return revs, vocab\n",
    "\n",
    "    \n",
    "def get_W(word_vecs, k=300):\n",
    "    \"\"\"\n",
    "    Get word matrix. W[i] is the vector for word indexed by i\n",
    "    \"\"\"\n",
    "    vocab_size = len(word_vecs)\n",
    "    word_idx_map = dict()\n",
    "    W = np.zeros(shape=(vocab_size+1, k), dtype=np.float32)\n",
    "    W[0] = np.zeros(k, dtype=np.float32)\n",
    "    i = 1\n",
    "    for word in word_vecs:\n",
    "        W[i] = word_vecs[word]\n",
    "        word_idx_map[word] = i\n",
    "        i += 1\n",
    "    return W, word_idx_map\n",
    "\n",
    "def load_bin_vec(fname, vocab):\n",
    "    \"\"\"\n",
    "    Loads 300x1 word vecs from Google (Mikolov) word2vec\n",
    "    \"\"\"\n",
    "    word_vecs = {}\n",
    "    with open(fname, 'rb') as f:\n",
    "        header = f.readline()\n",
    "        vocab_size, layer1_size = map(int, header.split())\n",
    "        binary_len = np.dtype('float32').itemsize * layer1_size\n",
    "        for line in xrange(vocab_size):\n",
    "            word = []\n",
    "            while True:\n",
    "                ch = f.read(1)\n",
    "                if ch == ' ':\n",
    "                    word = ''.join(word)\n",
    "                    break\n",
    "                if ch != '\\n':\n",
    "                    word.append(ch)   \n",
    "            if word in vocab:\n",
    "                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')  \n",
    "            else:\n",
    "                f.read(binary_len)\n",
    "    return word_vecs\n",
    "\n",
    "def add_unknown_words(word_vecs, vocab, min_df=1, k=300):\n",
    "    \"\"\"\n",
    "    For words that occur in at least min_df documents, create a separate word vector.    \n",
    "    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones\n",
    "    \"\"\"\n",
    "    for word in vocab:\n",
    "        if word not in word_vecs and vocab[word] >= min_df:\n",
    "            word_vecs[word] = np.random.uniform(-0.25,0.25,k)  \n",
    "\n",
    "def clean_str(string):\n",
    "    \"\"\"\n",
    "    Tokenization/string cleaning for dataset\n",
    "    Every dataset is lower cased except\n",
    "    \"\"\"\n",
    "    string = re.sub(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \", string)     \n",
    "    string = re.sub(r\"\\'s\", \" \\'s\", string) \n",
    "    string = re.sub(r\"\\'ve\", \" \\'ve\", string) \n",
    "    string = re.sub(r\"n\\'t\", \" n\\'t\", string) \n",
    "    string = re.sub(r\"\\'re\", \" \\'re\", string) \n",
    "    string = re.sub(r\"\\'d\", \" \\'d\", string) \n",
    "    string = re.sub(r\"\\'ll\", \" \\'ll\", string) \n",
    "    string = re.sub(r\",\", \" , \", string) \n",
    "    string = re.sub(r\"!\", \" ! \", string) \n",
    "    string = re.sub(r\"\\(\", \" \\( \", string) \n",
    "    string = re.sub(r\"\\)\", \" \\) \", string) \n",
    "    string = re.sub(r\"\\?\", \" \\? \", string) \n",
    "    string = re.sub(r\"\\s{2,}\", \" \", string)    \n",
    "    return string.strip().lower()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pre-process data and save it in Pickle format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data loaded!\n",
      "number of sentences: 50000\n",
      "vocab size: 112540\n",
      "max sentence length: 2633\n",
      "loading word2vec vectors... word2vec loaded!\n",
      "num words already in word2vec: 58843\n",
      "dataset created!\n"
     ]
    }
   ],
   "source": [
    "w2v_file = 'GoogleNews-vectors-negative300.bin'\n",
    "\n",
    "revs, vocab = build_data_train_test(data_train, data_test, train_ratio=0.8, clean_string=True)\n",
    "max_l = np.max(pd.DataFrame(revs)['num_words'])\n",
    "print 'data loaded!'\n",
    "print 'number of sentences: ' + str(len(revs))\n",
    "print 'vocab size: ' + str(len(vocab))\n",
    "print 'max sentence length: ' + str(max_l)\n",
    "print 'loading word2vec vectors...',\n",
    "w2v = load_bin_vec(w2v_file, vocab)\n",
    "print 'word2vec loaded!'\n",
    "print 'num words already in word2vec: ' + str(len(w2v))\n",
    "add_unknown_words(w2v, vocab)\n",
    "W, word_idx_map = get_W(w2v)\n",
    "cPickle.dump([revs, W, word_idx_map, vocab], open('imdb-train-val-test.pickle', 'wb'))\n",
    "print 'dataset created!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
